* Session reset: empty the data in memory, switch off --more-- paging,
* close any log that is still open (capture ignores the error when none is),
* and write subsequent logs as plain text.
clear
set more off
capture log close
set logtype text

***** CHANGE TO OWN HOME DIRECTORY
global homedir "~/Dropbox/intgmob_AssortativeMating"

**** Folder layout; every path below is built from homedir
global maindir "${homedir}/ReplicationPackage/Appendix_A"
global maintextdatdir "${homedir}/ReplicationPackage/intermediatedata"
global rawdir "${maindir}/raw_data"
global tempdir "${maindir}/temp_data"
global datdir "${maindir}/generated_data"
global outputdir "${maindir}/output"


** Use cleaned 1880 fathers file from main text replication package

use "${maintextdatdir}/1880_fathers.dta", clear

* Restrict to relate==3, ages 0 through 15, noname==0 and race==1.
* inrange() excludes missing ages, exactly as the pair of inequalities does.
keep if relate == 3 & inrange(age, 0, 15) & noname == 0 & race == 1

rename logoccscore_80_father logoccscore_father_mean

* Store the standard deviation of the father score for each sex code:
* sex==1 goes to the "_m_" global, sex==2 to the "_f_" global.
local code = 1
foreach tag in m f {
	summarize logoccscore_father_mean if sex == `code'
	global sigmay_all_`tag'_1880 = r(sd)
	local ++code
}

* One row per (sex, first name) holding the mean father score for that name;
* this file is the lookup merged onto the linked sample.
collapse (mean) logoccscore_father_mean, by(sex first)
save "${tempdir}/intgmob_means_1880.dta", replace

use "${rawdir}/linked_1880-1910_males.dta", clear

* Keep a single record for every (serial_2, pernum_2) combination
egen test = tag(serial_2 pernum_2)
keep if test == 1

	*** NAMES
	*** first name: first word of namefrst_2 with periods removed, trimmed, proper-cased
	generate str first = proper(trim(subinstr(word(namefrst_2, 1), ".", "", .)))

	*** obvious abbreviations (not nicknames)
	* The list is read two tokens at a time: abbreviation, then full name.
	local pairs Wm William Geo George Chas Charles Danl Daniel ///
		Jas James Jos Joseph Robt Robert Richd Richard ///
		Saml Samuel Thos Thomas Fredk Frederick Jno John ///
		Tho Thomas Michl Michael
	while "`pairs'" != "" {
		gettoken abbr pairs : pairs
		gettoken full pairs : pairs
		replace first = "`full'" if first == "`abbr'"
	}
	* Forms containing an apostrophe are written out explicitly
	* (proper() capitalises the letter after the apostrophe).
	replace first = "Frederick" if first == "Fred'K"
	replace first = "Samuel" if first == "Sam'L"

	* Flag empty names and names containing any of ? % - ! *
	generate noname = missing(first) | strpos(first, "?") > 0 | strpos(first, "%") > 0 ///
		| strpos(first, "-") > 0 | strpos(first, "!") > 0 | strpos(first, "*") > 0

	* Candidate records: age_2 in 30-45, linktype==0, relate_1==3, race_2==1,
	* and the spouse pointer refers to the next person number.
	generate key_obs = 1 if inrange(age_2, 30, 45) & linktype == 0 & relate_1 == 3 ///
		& sploc_2 == pernum_2 + 1 & race_2 == 1

	* After sorting, the spouse of a candidate is expected on the following row.
	* NOTE(review): the check below compares person numbers only; it does not
	* confirm that row _n+1 shares the same serial_2 - confirm this cannot bind.
	gsort serial_2 pernum_2
	assert sploc_2 == pernum_2[_n+1] if key_obs == 1
	count if key_obs == 1
	generate spouse_first = first[_n+1] if key_obs == 1
	generate spouse_sex = sex_2[_n+1] if key_obs == 1
	generate spouse_age = age_2[_n+1] if key_obs == 1
	tabulate spouse_sex

	* Drop candidates whose spouse has sex code 1 or an age outside 30-45
	* (a missing spouse age also falls outside the range).
	replace key_obs = . if spouse_sex == 1 | !inrange(spouse_age, 30, 45)

	* Mean of log(occscore_1) over relate_1==1 records, spread within serial_1
	generate x = ln(occscore_1) if relate_1 == 1
	egen logoccscore_father = mean(x), by(serial_1)

	* Attach the name-level mean for the record's own first name and sex
	generate sex = sex_2
	merge m:1 first sex using "${tempdir}/intgmob_means_1880.dta", keep(master match) nogenerate
	rename logoccscore_father_mean logoccscore_father_mean_husb

	* Swap the spouse's name and sex into the merge keys and attach again
	drop sex
	rename first first_husb
	rename spouse_first first
	rename spouse_sex sex

	merge m:1 first sex using "${tempdir}/intgmob_means_1880.dta", keep(master match) nogenerate
	rename logoccscore_father_mean logoccscore_father_mean_wife

	* Restore the original variable names
	rename first spouse_first
	rename sex spouse_sex
	rename first_husb first

	* rho1: household-based father score against the wife's name-level mean
	correlate logoccscore_father logoccscore_father_mean_wife if key_obs == 1
	global rho1_1880 = r(rho)
	* rho2_linked: name-level means on both sides
	correlate logoccscore_father_mean_husb logoccscore_father_mean_wife if key_obs == 1
	global rho2_linked_1880 = r(rho)
	
* Correlation in the full panel file, stored as rho2_1880.
* The file path is quoted so that a home directory containing spaces
* still resolves to a single filename.
use "$maintextdatdir/panel_trends_30yr_income.dta", clear

* Restrict to rows where both race and spouserace equal 1
keep if race==1 & spouserace == 1	

* samp marks rows with year2==1910 and sex==1
gen samp= year2==1910 & sex==1
corr logoccscore_father logoccscore_father_spouse if samp==1
global rho2_1880 = r(rho)


*********** 1850-1880 sample


* Load the 1850 fathers file. The path is quoted so that a home directory
* containing spaces still resolves to a single filename.
use "$maintextdatdir/1850_fathers.dta", clear

* Restrict to relate==3, ages 0 through 15, noname==0 and race==1
keep if relate==3 & age>=0 & age<=15 & noname==0 & race==1

rename logoccscore_50_father logoccscore_father_mean

* Standard deviation of the father score for sex==1, stored in the "_m_" global
summ logoccscore_father_mean if sex==1
global sigmay_all_m_1850 = r(sd)

* Standard deviation of the father score for sex==2, stored in the "_f_" global
summ logoccscore_father_mean if sex==2
global sigmay_all_f_1850 = r(sd)

* One row per (sex, first name) holding the mean father score for that name;
* saved as the lookup file merged onto the linked sample.
collapse (mean) logoccscore_father_mean, by(sex first)
save "$tempdir/intgmob_means_1850.dta", replace

use "${rawdir}/linked_1850-1880_males.dta", clear

* Keep a single record for every (serial_2, pernum_2) combination
egen test = tag(serial_2 pernum_2)
keep if test == 1

	*** NAMES
	*** first name: first word of namefrst_2 with periods removed, trimmed, proper-cased
	generate str first = proper(trim(subinstr(word(namefrst_2, 1), ".", "", .)))

	*** obvious abbreviations (not nicknames)
	* The list is read two tokens at a time: abbreviation, then full name.
	local pairs Wm William Geo George Chas Charles Danl Daniel ///
		Jas James Jos Joseph Robt Robert Richd Richard ///
		Saml Samuel Thos Thomas Fredk Frederick Jno John ///
		Tho Thomas Michl Michael
	while "`pairs'" != "" {
		gettoken abbr pairs : pairs
		gettoken full pairs : pairs
		replace first = "`full'" if first == "`abbr'"
	}
	* Forms containing an apostrophe are written out explicitly
	* (proper() capitalises the letter after the apostrophe).
	replace first = "Frederick" if first == "Fred'K"
	replace first = "Samuel" if first == "Sam'L"

	* Flag empty names and names containing any of ? % - ! *
	generate noname = missing(first) | strpos(first, "?") > 0 | strpos(first, "%") > 0 ///
		| strpos(first, "-") > 0 | strpos(first, "!") > 0 | strpos(first, "*") > 0

	* Candidate records: age_2 in 30-45, linktype==0, relate_1==3, race_2==1,
	* and the spouse pointer refers to the next person number.
	generate key_obs = 1 if inrange(age_2, 30, 45) & linktype == 0 & relate_1 == 3 ///
		& sploc_2 == pernum_2 + 1 & race_2 == 1

	* After sorting, the spouse of a candidate is expected on the following row.
	* NOTE(review): the check below compares person numbers only; it does not
	* confirm that row _n+1 shares the same serial_2 - confirm this cannot bind.
	gsort serial_2 pernum_2
	assert sploc_2 == pernum_2[_n+1] if key_obs == 1
	count if key_obs == 1
	generate spouse_first = first[_n+1] if key_obs == 1
	generate spouse_sex = sex_2[_n+1] if key_obs == 1
	generate spouse_age = age_2[_n+1] if key_obs == 1
	tabulate spouse_sex

	* Drop candidates whose spouse has sex code 1 or an age outside 30-45
	* (a missing spouse age also falls outside the range).
	replace key_obs = . if spouse_sex == 1 | !inrange(spouse_age, 30, 45)

	* Mean of log(occscore_1) over relate_1==1 records, spread within serial_1
	generate x = ln(occscore_1) if relate_1 == 1
	egen logoccscore_father = mean(x), by(serial_1)

	* Attach the name-level mean for the record's own first name and sex
	generate sex = sex_2
	merge m:1 first sex using "${tempdir}/intgmob_means_1850.dta", keep(master match) nogenerate
	rename logoccscore_father_mean logoccscore_father_mean_husb

	* Swap the spouse's name and sex into the merge keys and attach again
	drop sex
	rename first first_husb
	rename spouse_first first
	rename spouse_sex sex

	merge m:1 first sex using "${tempdir}/intgmob_means_1850.dta", keep(master match) nogenerate
	rename logoccscore_father_mean logoccscore_father_mean_wife

	* Restore the original variable names
	rename first spouse_first
	rename sex spouse_sex
	rename first_husb first

	* rho1: household-based father score against the wife's name-level mean
	correlate logoccscore_father logoccscore_father_mean_wife if key_obs == 1
	global rho1_1850 = r(rho)
	* rho2_linked: name-level means on both sides
	correlate logoccscore_father_mean_husb logoccscore_father_mean_wife if key_obs == 1
	global rho2_linked_1850 = r(rho)
	

* Correlation in the full panel file, stored as rho2_1850
use "${maintextdatdir}/panel_trends_30yr_income.dta", clear

* Both race and spouserace must equal 1
keep if race == 1 & spouserace == 1

* samp is 1 for rows with year2==1880 and sex==1, 0 otherwise
generate samp = (year2 == 1880) & (sex == 1)
correlate logoccscore_father logoccscore_father_spouse if samp
global rho2_1850 = r(rho)


* Write the stored correlations to the text log tableA-1.txt.
* For each period the log shows rho1 ("Actual"), rho2_linked ("Pseudo ... in
* linked data"), rho2 ("Pseudo ... in full dataset"), and the ratio
* rho2_linked / rho1.
* Commands and comments between "log using" and "log close" are echoed into
* the log file, so the lines inside that region are left exactly as they are.
log using "$outputdir/tableA-1.txt", replace

****RESULTS 1850-1880

di "Actual correlation coefficient is $rho1_1850"
di "Pseudo correlation coefficient in linked data is $rho2_linked_1850"
di "Pseudo correlation coefficient in full dataset is $rho2_1850"

***Estimated chi is attenuation factor estimated from linked data

di $rho2_linked_1850/$rho1_1850


****RESULTS 1880-1910

di "Actual correlation coefficient is $rho1_1880"
di "Pseudo correlation coefficient in linked data is $rho2_linked_1880"
di "Pseudo correlation coefficient in full dataset is $rho2_1880"

***Estimated chi is attenuation factor estimated from linked data

di $rho2_linked_1880/$rho1_1880


log close

* Remove the name-level lookup files saved earlier in this script
erase "$tempdir/intgmob_means_1880.dta"
erase "$tempdir/intgmob_means_1850.dta"
